Required libraries
library(tidyverse)
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.1.3
library(readxl)
library(fs)
library(RColorBrewer)
Loading data, modifying slightly, and combining
# Load every play manuscript (one .jsonl file per play) found in ../data and
# stack them into a single tibble.
play_files <- dir_ls("../data", glob = "*.jsonl")
allplays <- map_dfr(play_files, function(path) {
  # stream_in() parses the line-delimited JSON records of one play
  play <- tibble(jsonlite::stream_in(file(path), verbose = FALSE))
  # strip the first " (" found in each speaker label
  play <- mutate(play, speaker = stringr::str_remove(speaker, " \\("))
  # remember which file every row came from; used as a join key later on
  add_column(play, filename = basename(path))
})
# combined "title year" label for each row
allplays <- mutate(allplays, title_year = paste(title, year))
# Character alias sheet: the comma-separated `Alias` column is split into one
# row per alias (`variant`); main names and variants are lower-cased.
role_list <- read_excel("../data/Rolleliste.xlsx")
aliases <- unnest_tokens(
  role_list, variant, Alias,
  token = "regex", pattern = ", "
)
aliases <- mutate(
  aliases,
  Karakter = tolower(Karakter),
  variant = tolower(variant)
)
aliases
# Rename speakers to their main name: match the lower-cased speaker against
# the alias variants of the same file and, where a match exists, replace the
# speaker with the main name (`Karakter`).
allplays <- allplays %>%
  mutate(speaker = tolower(speaker)) %>%
  left_join(aliases, by = c("filename" = "Filnavn", "speaker" = "variant")) %>%
  mutate(speaker = if_else(is.na(Karakter), speaker, Karakter))
# Sheet with the gender and social class of all characters.
# The gender codes are rewritten in sequence (order matters):
#   "f" -> "Female", "mp" -> "m", "m" -> "Male", "mfp" -> "Mixture"
gender_social <- read_excel("../data/gender_AND_Mask_alaw_sammenlagte_karakterer_reduceret_i_kategoriantal.xlsx") %>%
  mutate(
    gender = gsub("^f$", "Female", gender),
    gender = gsub("mp", "m", gender),
    gender = gsub("^m$", "Male", gender),
    gender = gsub("mfp", "Mixture", gender)
  )
# Merge the two social-status columns into one `social_status` column and
# strip any literal "NA" left at the start or end of the merged text.
gender_social <- gender_social %>%
  unite(
    "social_status",
    "social status, main character",
    "social status, other characters",
    sep = "", na.rm = TRUE
  ) %>%
  mutate(
    social_status = gsub("^NA", "", social_status),
    social_status = gsub("NA$", "", social_status)
  )
gender_social
# Social statuses that are not of higher interest (including the empty
# string) are collapsed into a single "Other" category.
other <- c("Artisans","Extern","Children","Riding bailiff/Proprietor","Church officials","","Military/Law enforcement", "Performers/Artists")
gender_social <- mutate(
  gender_social,
  social_status = ifelse(social_status %in% other, "Other", social_status)
)
# Attach gender and social status to every row of allplays.
# NOTE(review): columns are picked by position (1, 2, 7, 8); presumably these
# are play, speaker, gender and social_status -- confirm against the sheet.
status_lookup <- gender_social[, c(1, 2, 7, 8)]
allplays <- allplays %>%
  mutate(docTitle = tolower(docTitle)) %>%
  left_join(status_lookup, by = c("docTitle" = "play", "speaker" = "speaker"))
allplays
Identifying characters that are present on stage but silent
# One-column tibble with every distinct (non-missing) speaker name.
names <- allplays %>%
  filter(!is.na(speaker)) %>%
  distinct(speaker)

# Names of characters on stage are recorded in the `stage` and
# `speaker_stage` variables; both are tokenised below.
token_columns <- c("docTitle", "filename", "title", "act", "scene", "index", "word")

# Explicit stage entries (not starting with "("): split on commas.
explicit_stage_tokens <- allplays %>%
  filter(!is.na(stage), !startsWith(stage, "(")) %>%
  unnest_tokens(word, stage, drop = FALSE, token = "regex", pattern = ", *") %>%
  select(all_of(token_columns)) %>%
  distinct()

# Implicit (parenthetical) stage entries: split into single words.
implicit_stage_tokens <- allplays %>%
  filter(!is.na(stage), startsWith(stage, "(")) %>%
  unnest_tokens(word, stage) %>%
  select(all_of(token_columns)) %>%
  distinct()

# The speaker_stage variable: split into single words, dropping missing tokens.
speaker_stage_tokens <- allplays %>%
  unnest_tokens(word, speaker_stage) %>%
  filter(!is.na(word)) %>%
  select(all_of(token_columns))

# Keep only the tokens that are character names.
keep_character_names <- function(tokens) {
  semi_join(tokens, names, by = c("word" = "speaker"))
}
explicit_names_in_stage <- keep_character_names(explicit_stage_tokens)
implicit_names_in_stage <- keep_character_names(implicit_stage_tokens)
names_in_speaker_stage <- keep_character_names(speaker_stage_tokens)

# Combine the three sources (natural full joins on all shared columns).
all_people_in_stage <- reduce(
  list(explicit_names_in_stage, implicit_names_in_stage, names_in_speaker_stage),
  full_join
)
# The characters who speak in each scene (one row per file/act/scene/speaker).
speakers <- allplays %>%
  filter(!is.na(speaker)) %>%
  select(filename, act, scene, speaker) %>%
  distinct()

# For every speaking character, find the main name(s) it is a variant of and
# emit one row per alias of those main names, so that a character counts as
# "speaking" in a scene under any of its aliases.
#
# * seq_len() keeps the iteration empty when `speakers` has no rows
#   (1:nrow() would iterate over c(1, 0)).
# * %in% is used instead of ==: it never returns NA (so missing values in the
#   alias sheet cannot inject NA rows) and it stays correct when a variant
#   maps to more than one main name (== would recycle silently).
# * rows are collected by map_dfr() instead of growing a data frame with
#   rbind() inside a loop.
#
# NOTE(review): the lookup is not restricted to the alias rows of the current
# file (`Filnavn`); aliases are matched across all plays -- confirm intended.
speakers_w_aliases <- map_dfr(seq_len(nrow(speakers)), function(i) {
  main_name <- aliases$Karakter[aliases$variant %in% speakers$speaker[i]]
  alias_names <- aliases$variant[aliases$Karakter %in% main_name]
  n_alias <- length(alias_names)
  tibble(
    filename = rep(speakers$filename[i], n_alias),
    act = rep(speakers$act[i], n_alias),
    scene = rep(speakers$scene[i], n_alias),
    speaker = alias_names
  )
})

# Add the alias rows to the speakers and drop duplicates.
speakers <- bind_rows(speakers, speakers_w_aliases) %>%
  distinct()
# Silent characters: everyone named on stage who does not speak (under any
# alias) in the same file, act and scene.
scene_keys <- c("filename", "act", "scene", "word" = "speaker")
silent <- distinct(anti_join(all_people_in_stage, speakers, by = scene_keys))

# Map every stage token to its main character name via the alias sheet,
# falling back to the token itself when no alias matches; then attach gender
# and social class.
# NOTE(review): columns of gender_social are picked by position (1, 2, 7, 8);
# presumably play, speaker, gender and social_status -- confirm.
silent <- silent %>%
  left_join(aliases, by = c("filename" = "Filnavn", "word" = "variant")) %>%
  mutate(speaker = if_else(is.na(Karakter), word, Karakter)) %>%
  left_join(
    gender_social[, c(1, 2, 7, 8)],
    by = c("docTitle" = "play", "speaker" = "speaker")
  )
silent
Preparing a data frame and plotting the share of silent characters
# Count silent occurrences by gender and social class
# (rows with gender "Mixture" or missing gender are removed).
df_silent <- silent %>%
  count(gender, social_status) %>%
  filter(!gender %in% c("Mixture", NA))

# Per-gender totals, appended as an extra "Total" class. The gender labels
# come from the data itself instead of a hard-coded c("Female", "Male"), so
# the totals cannot be attached to the wrong gender or fail when a gender is
# absent.
df_total <- df_silent %>%
  group_by(gender) %>%
  summarise(n = sum(n)) %>%
  mutate(social_status = "Total")
total <- pull(df_total, n)
df_silent <- bind_rows(df_silent, df_total)

# Same table for ALL on-stage occurrences (silent and speaking): map stage
# tokens to main character names, then attach gender and social class.
# NOTE(review): columns of gender_social are picked by position (1, 2, 7, 8);
# presumably play, speaker, gender and social_status -- confirm.
all_people_in_stage <- all_people_in_stage %>%
  left_join(aliases, by = c("filename" = "Filnavn", "word" = "variant")) %>%
  mutate(speaker = if_else(!is.na(Karakter), Karakter, word))
all_people_in_stage <- all_people_in_stage %>%
  left_join(
    gender_social[, c(1, 2, 7, 8)],
    by = c("docTitle" = "play", "speaker" = "speaker")
  )
df_all <- all_people_in_stage %>%
  count(gender, social_status) %>%
  filter(!gender %in% c("Mixture", NA))
df_total2 <- df_all %>%
  group_by(gender) %>%
  summarise(n = sum(n)) %>%
  mutate(social_status = "Total")
total2 <- pull(df_total2, n)
df_all <- bind_rows(df_all, df_total2)

# Keep only the gender/class combinations that occur among silent characters.
df_all <- df_all %>%
  semi_join(df_silent, by = c("gender", "social_status"))

# Percentage of silent occurrences. The denominator is matched by key
# (gender, social_status) rather than by row position, so a differing row
# order or row set can no longer pair a count with the wrong denominator.
df_silent <- df_silent %>%
  left_join(rename(df_all, n_all = n), by = c("gender", "social_status")) %>%
  mutate(percentage = 100 * n / n_all)

df_all <- df_all %>% mutate(social_status = factor(social_status))
df_silent <- df_silent %>% mutate(social_status = factor(social_status))

# Percentages aggregated across gender. Per-gender percentages have different
# denominators and must not be added up; the pooled share is
# (all silent occurrences) / (all occurrences) within each class.
df_silent_agg <- df_silent %>%
  group_by(social_status) %>%
  summarise(
    gender = "Both",
    n = sum(n),
    n_all = sum(n_all),
    percentage = 100 * n / n_all
  )
df_silent <- bind_rows(df_silent, df_silent_agg) %>%
  mutate(gender = factor(gender, levels = c("Female", "Male", "Both")))
# Bar chart of the percentage of silent occurrences per social class, one
# panel per gender level. Seven bars share one colour and the eighth fill
# value is darker; the legend is hidden.
ggplot(df_silent, aes(x = social_status, y = percentage, fill = social_status)) +
  geom_col() +
  facet_wrap(vars(gender), scales = "free_x") +
  scale_fill_manual(values = c(rep("#02818A", 7), "#024f54")) +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, 43),
    breaks = seq(0, 40, by = 10)
  ) +
  xlab("Social class") +
  ylab("Percentage of silent occurrences") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 40, hjust = 1),
    panel.grid.major.x = element_blank()
  )

# Alternative plot: female and male percentages stacked in one bar per social
# class, each segment labelled with its rounded percentage.
df_silent %>%
  filter(gender %in% c("Female", "Male")) %>%
  ggplot(aes(x = social_status, y = percentage, fill = gender)) +
  geom_col() +
  geom_text(
    aes(label = paste0(round(percentage, 1), "%")),
    position = position_stack(vjust = 0.9),
    color = "white",
    size = 3
  ) +
  scale_fill_manual(values = c("#d6604d", "#4393c3")) +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, 43),
    breaks = seq(0, 40, by = 10)
  ) +
  xlab("Social class") +
  ylab("Percentage of silent occurrences") +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    axis.text.x = element_text(angle = 40, hjust = 1),
    panel.grid.major.x = element_blank()
  )

A data frame is prepared for plotting percentages of speech
# Words spoken per speaker in every scene of every play.
df <- allplays %>%
  # remove rows that are not dialogue
  filter(act != "", scene != "", speaker != "", !is.na(speaker), !is.na(spoke)) %>%
  # add the number of spoken words
  mutate(n_spoken_words = str_count(spoke, "\\w+")) %>%
  # one group per play / act / scene / speaker (gender and class ride along)
  group_by(year, title_year, act_number, scene_number, speaker, gender, social_status) %>%
  # sum the words spoken by each speaker; drop the grouping explicitly so no
  # stale groups leak into the steps below
  summarise(words = sum(n_spoken_words), .groups = "drop")

# Highest scene number within each act (across all plays), accumulated into
# positions for the vertical act separators in the plots. The first separator
# is shifted by 0.5 so the lines fall between two scenes; row_number() keeps
# this working even when there are no acts at all.
borders <- df %>%
  group_by(act_number) %>%
  summarise(max_scene = max(scene_number)) %>%
  mutate(borders = cumsum(max_scene + if_else(row_number() == 1, 0.5, 0))) %>%
  pull(borders)

df <- df %>%
  # each row's share (in percent) of all words spoken in its play
  group_by(title_year) %>%
  mutate(percent = 100 * (words / sum(words))) %>%
  ungroup() %>%
  # add act:scene column, e.g. "1:03"
  mutate(act_scene = paste0(act_number, ":", str_pad(scene_number, 2, pad = "0"))) %>%
  # minor correction
  mutate(title_year = gsub("ARTAXERXES", "Artaxerxes", title_year)) %>%
  # remove plays that are very short
  filter(!title_year %in% c("Nytårsprolog til en komedie (1723)", "Den danske komedies ligbegængelse (1746)"))

# Make the order of plays chronological: the rows are sorted by year (first
# grouping variable of the summary above), so first appearance is year order.
play_chronology <- unique(df$title_year)
df$title_year <- factor(df$title_year, levels = play_chronology)
Coloring by gender
# Share of spoken words per scene, stacked by gender, one facet row per play.
df %>%
  group_by(title_year, act_number, scene_number, act_scene, gender) %>%
  summarise(percent = sum(percent), .groups = "drop") %>%
  # Label characters without a known gender as "Unknown". gsub() leaves real
  # missing values (NA) untouched, so they are handled with is.na(); a
  # literal "NA" string is still relabelled as before.
  mutate(gender = if_else(is.na(gender) | gender == "NA", "Unknown", gender)) %>%
  ggplot(aes(fill = gender, y = percent, x = act_scene)) +
  geom_bar(stat = "identity") +
  # four fill values for the gender levels in alphabetical order
  scale_fill_manual(values = c("#d6604d", "#4393c3", "#82817f", "black")) +
  xlab("Act:Scene") +
  ylab("Percentage of spoken words") +
  facet_grid(rows = vars("title_year" = title_year),
             switch = "x", scales = "free_y") +
  # vertical separators at the precomputed act borders
  geom_vline(xintercept = borders, size = 0.2) +
  theme_bw() +
  theme(legend.position = "bottom",
        legend.title = element_blank(),
        axis.text.x = element_text(angle = 90, size = 6, hjust = 0, vjust = 0.5),
        axis.text.y = element_text(size = 4),
        strip.text.y = element_text(angle = 0, hjust = 0))

Coloring by social status
# Share of spoken words per scene, stacked by social status, one facet row
# per play; vertical lines are drawn at the precomputed act borders.
df %>%
  group_by(title_year, act_number, scene_number, act_scene, social_status) %>%
  summarise(percent = sum(percent)) %>%
  ggplot(aes(x = act_scene, y = percent, fill = social_status)) +
  geom_col() +
  geom_vline(xintercept = borders, size = 0.2) +
  facet_grid(
    rows = vars(title_year = title_year),
    switch = "x",
    scales = "free_y"
  ) +
  scale_fill_brewer(palette = "Set3", direction = -1) +
  labs(x = "Act:Scene", y = "Percentage of spoken words") +
  theme_bw() +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    strip.text.y = element_text(angle = 0, hjust = 0),
    axis.text.y = element_text(size = 4),
    axis.text.x = element_text(angle = 90, size = 6, hjust = 0, vjust = 0.5)
  )
